Author

ruinan

Code
library(dplyr)
library(knitr)
library(limma)
library(DT)
library(readr)
library(knitr)
library(tidyr)
library(ggplot2)
library(ggcorrplot)

3. Variable Selection Rationale

We selected SLEDAI, C3, and C4 as our core clinical indicators for risk grouping and downstream differential expression analysis based on their biological importance and widespread use in lupus research and clinical practice.

The table below summarizes the full names, biological functions, cutoff thresholds, and supporting references of each variable:

Variable Full Name Biological Role Cutoff Threshold Reference
SLEDAI Systemic Lupus Erythematosus Disease Activity Index Measures lupus disease activity. Higher scores indicate more active disease. ≥ 10 (High Risk) Bombardier et al., 1992; Petri et al., 2005
C3 Complement Component 3 Immune protein; lower levels indicate increased immune complex activation. < 0.9 g/L (High Risk) Walport, 2001; Yang et al., 2020
C4 Complement Component 4 Works with C3; its reduction also reflects lupus disease activity. < 0.1 g/L (High Risk) Walport, 2001; Yang et al., 2020
Anti-dsDNA Anti-double-stranded DNA Antibody Autoantibody used in lupus diagnosis; higher titers indicate more severe immune response. ≥ 30 IU/mL = Positive; < 30 IU/mL = Negative Pisetsky, 2017; GSE88884 Processing Guide

4. Data Analysis

4.1 Simple EDA

Read the data again and check if it was successfully pulled. Perform data composition and clean up necessary NA values for simple answers.

Then, To reduce skewness and stabilize variance across samples, we applied log2 normalization to the expression matrix (expr_df_matched). This method is commonly used for gene expression data and avoids the need for additional quantile normalization packages.

Code
#normalization
expr_df_matched <- log2(expr_df_matched + 1)

#napart
na_counts <- sapply(pheno_clean, function(x) sum(is.na(x)))
na_table <- data.frame(Column = names(na_counts), Number_of_NA_Values = na_counts)
datatable(na_table, caption = "NA Count Table for Clinical Variables")
Code
pheno_clean <- pheno_clean[complete.cases(pheno_clean), ]


shared_samples <- intersect(colnames(expr_df_matched), rownames(pheno_clean))
pheno_clean <- pheno_clean[shared_samples, ]
expr_df_matched <- expr_df_matched[, shared_samples]

4.1.2Variable distribution

We created histograms to understand the underlying distribution of each clinical variable. It helps us determine whether the data is skewed or symmetrical, and whether a threshold-based grouping is reasonable.

Code
pheno_long <- pheno_clean %>%
  select(SLEDAI = sledai_at_baseline, C3 = c3, C4 = c4) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(pheno_long, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "#0073C2FF", color = "white") +
  facet_wrap(~Variable, scales = "free") +
  labs(title = "Distribution of Clinical Variables",
       x = "Value",
       y = "Frequency") +
  theme_minimal()

Boxplot of Clinical Variables

We can see that SLEDAI distinguishes the activity levels between samples very clearly. It is one of the core indicators of group modeling

Code
ggplot(pheno_long, aes(x = "", y = Value)) +
  geom_boxplot(fill = "#f9844a") +
  facet_wrap(~Variable, scales = "free_y") +  
  labs(title = "Boxplot of Clinical Variables ",
       x = "",
       y = "Value") +
  theme_minimal()

4.1.3Correlation Between Clinical Variables

C3 and C4: strongly positively correlated (r=0.71), biologically consistent with the immune complement system. SLEDAI and C3: Negative correlation (r=-0.31) SLEDAI and C4: Negative correlation (r=-0.27) It can be seen that there is no multicollinearity among the three variables, which can be used together for grouping and modeling. And consistent with the literature, the higher the activity of SLE, the lower the complement.

Code
cor_vars <- pheno_clean %>%
  select(SLEDAI = sledai_at_baseline, C3 = c3, C4 = c4)

cor_matrix <- cor(cor_vars, use = "complete.obs", method = "pearson")

ggcorrplot(cor_matrix, 
           lab = TRUE,
           type = "lower", 
           lab_size = 4,
           title = "Correlation Matrix of Clinical Variables",
           colors = c("#B2182B", "white", "#2166AC"))

4.1.4 More Display - Personnel Variables Display

The sample population is predominantly female (1620 out of 1750), which is consistent with the epidemiology of systemic lupus erythematosus (SLE), a disease known to disproportionately affect women. This supports the clinical representativeness of the dataset and suggests that sex may serve as a useful covariate in downstream modeling.

Code
if("sex" %in% names(pheno_clean)) {
  sex_table <- as.data.frame(table(pheno_clean$sex))
  colnames(sex_table) <- c("Sex", "Count")
  
  datatable(sex_table,
            caption = "Sex Distribution in the Sample",
            options = list(pageLength = 5, autoWidth = TRUE))
}
Code
if("age" %in% names(pheno_clean)) {
  ggplot(pheno_clean, aes(x = age)) +
    geom_histogram(bins = 30, fill = "#00AFBB", color = "white") +
    labs(title = "Age Distribution", x = "Age", y = "Frequency") +
    theme_minimal()
}

4.2baseline model

Code
pheno_clean$risk_group_SLEDAI <- ifelse(
  pheno_clean$sledai_at_baseline >= 10, "HighRisk_SLEDAI", "LowRisk_SLEDAI"
)


baseline_vars <- c("c3", "c4", "antidsdna_at_baseline", "risk_group_SLEDAI")


pheno_baseline <- pheno_clean[complete.cases(pheno_clean[, baseline_vars]), ]


pheno_baseline$risk_group_SLEDAI <- factor(pheno_baseline$risk_group_SLEDAI,
                                           levels = c("LowRisk_SLEDAI", "HighRisk_SLEDAI"))


baseline_model <- glm(risk_group_SLEDAI ~ c3 + c4 + antidsdna_at_baseline,
                      data = pheno_baseline,
                      family = binomial)


baseline_prob <- predict(baseline_model, type = "response")
baseline_pred <- ifelse(baseline_prob > 0.5, "HighRisk_SLEDAI", "LowRisk_SLEDAI")
baseline_pred <- factor(baseline_pred, levels = levels(pheno_baseline$risk_group_SLEDAI))


library(caret)
Loading required package: lattice
Code
confusion_matrix <- confusionMatrix(baseline_pred, pheno_baseline$risk_group_SLEDAI)

Baseline Model Performance Summary (Logistic Regression: C3 + C4 + anti-dsDNA)

Metric Value Explanation
Accuracy 0.651 Overall proportion of correct predictions
Balanced Accuracy 0.628 Average of Sensitivity and Specificity; accounts for class imbalance
Kappa 0.264 Agreement between prediction and true label beyond chance
Sensitivity 0.478 True Positive Rate (recall); correct identification of LowRisk patients
Specificity 0.778 True Negative Rate; correct identification of HighRisk patients
Positive Predictive Value (PPV) 0.613 Proportion of predicted LowRisk that were actually LowRisk
Negative Predictive Value (NPV) 0.67 Proportion of predicted HighRisk that were actually HighRisk
No Information Rate (NIR) 0.576 Accuracy of always predicting the majority class
p-value (Accuracy > NIR) <0.001 Statistical test showing if model outperforms random guess

4.3 Difference Analysis

4.3.1Exploration of risk grouping

According to the conclusions of Anian’s data cleaning and the references in the paper. Risk grouping is performed for the three important indicators of SLE activity index, C3, and C4.

Code
#Create risk classification based on different clinical indicators
pheno_clean$risk_group_sledai <- ifelse(
  pheno_clean$sledai_at_baseline >= 10, "HighRisk_SLEDAI", "LowRisk_SLEDAI"
)
pheno_clean$risk_group_c3 <- ifelse(
  pheno_clean$c3 < 0.9, "HighRisk_C3", "LowRisk_C3"
)
pheno_clean$risk_group_c4 <- ifelse(
  pheno_clean$c4 < 0.1, "HighRisk_C4", "LowRisk_C4"
)

#Calculate the sample size for each risk group
df_sledai = as_tibble(table(pheno_clean$risk_group_sledai), .name_repair = "minimal")
df_c3 = as_tibble(table(pheno_clean$risk_group_c3), .name_repair = "minimal")
df_c4 = as_tibble(table(pheno_clean$risk_group_c4), .name_repair = "minimal")


colnames(df_sledai) = c("Group", "Count_SLEDAI")
colnames(df_c3) = c("Group", "Count_C3")
colnames(df_c4) = c("Group", "Count_C4")


df_combined_sledai_c3 = full_join(df_sledai, df_c3, by = "Group")
df_combined = full_join(df_combined_sledai_c3, df_c4, by = "Group")

kable(df_combined, caption = "Risk Group Statistics Based on SLEDAI, C3, and C4 Levels")
Risk Group Statistics Based on SLEDAI, C3, and C4 Levels
Group Count_SLEDAI Count_C3 Count_C4
HighRisk_SLEDAI 1008 NA NA
LowRisk_SLEDAI 742 NA NA
HighRisk_C3 NA 650 NA
LowRisk_C3 NA 1100 NA
HighRisk_C4 NA NA 384
LowRisk_C4 NA NA 1366

4.3.2 Differential analysis of SLEDAI risk grouping

The selection of threshold is based on data cleaning and the support of geo sourced literature. 2.And select significant genes based on the significance level of p<0.05.

Code
threshold_sledai <- 10
pheno_clean$risk_group_SLEDAI <- ifelse(pheno_clean$sledai_at_baseline >= threshold_sledai, "HighRisk_SLEDAI", "LowRisk_SLEDAI")


design_sledai <- model.matrix(~ pheno_clean$risk_group_SLEDAI)

#Fit limma model
fit_sledai <- lmFit(expr_df_matched, design_sledai)

#Applying eBayes method
fit_sledai <- eBayes(fit_sledai)

#Extract differentially expressed genes
results_sledai <- topTable(fit_sledai, coef=2, n=Inf)  
results_sledai_filtered <- results_sledai[results_sledai$adj.P.Val < 0.05, ]  

#Create an interactive table using the DT package
datatable(round(results_sledai_filtered, 2), 
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "Significant Differentially Expressed Genes in SLEDAI (p < 0.05)")

4.3.3 Differential analysis of C3 risk grouping

Process, threshold setting, and significance level setting are consistent with 4.2.1

Code
threshold_c3 <- 0.9

pheno_clean$risk_group_C3 <- ifelse(pheno_clean$c3 < threshold_c3, "HighRisk_C3", "LowRisk_C3")


design_c3 <- model.matrix(~ pheno_clean$risk_group_C3)

fit_c3 <- lmFit(expr_df_matched, design_c3)

fit_c3 <- eBayes(fit_c3)


results_c3 <- topTable(fit_c3, coef=2, n=Inf)  
results_c3_filtered <- results_c3[results_c3$adj.P.Val < 0.05, ]  


datatable(round(results_c3_filtered, 2), 
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "Significant Differentially Expressed Genes in C3 Analysis (p < 0.05)")

4.3.4 Differential analysis of C4 risk grouping

Process, threshold setting, and significance level setting are consistent with 4.2.1

Code
# 假设C4得分阈值是0.1
threshold_c4 <- 0.1

# 创建风险分组列
pheno_clean$risk_group_C4 <- ifelse(pheno_clean$c4 < threshold_c4, "HighRisk_C4", "LowRisk_C4")

# 为 C4 风险分组设置设计矩阵
design_c4 <- model.matrix(~ pheno_clean$risk_group_C4)

# 拟合 limma 模型
fit_c4 <- lmFit(expr_df_matched, design_c4)

# 应用 eBayes 方法
fit_c4 <- eBayes(fit_c4)

# 提取差异表达的基因
results_c4 <- topTable(fit_c4, coef=2, n=Inf)  # 提取所有基因
results_c4_filtered <- results_c4[results_c4$adj.P.Val < 0.05, ]  # 筛选 p 值小于 0.05 的基因

# 使用 DT 包来创建一个交互式表格
datatable(round(results_c4_filtered, 2), 
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "Significant Differentially Expressed Genes in C4 Analysis (p < 0.05)")
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

4.4 Merge common significant genes

Merge and display the common significant genes of the three risk groups, so that the selected common significant genes under the three high reference indicators will definitely have stronger predictive performance.

Code
#Screen genes with p-values less than 0.05 in each analysis group
significant_sledai <- results_sledai[results_sledai$adj.P.Val < 0.05,]
significant_c3 <- results_c3[results_c3$adj.P.Val < 0.05,]
significant_c4 <- results_c4[results_c4$adj.P.Val < 0.05,]

#Find the intersection using gene names
common_genes <- Reduce(intersect, list(rownames(significant_sledai), 
                                       rownames(significant_c3), 
                                       rownames(significant_c4)))

#Can create a data box containing these common genetic information
common_genes_data <- data.frame(
  Gene = common_genes,
  PVal_SLEDAI = significant_sledai[common_genes, "adj.P.Val", drop = FALSE],
  PVal_C3 = significant_c3[common_genes, "adj.P.Val", drop = FALSE],
  PVal_C4 = significant_c4[common_genes, "adj.P.Val", drop = FALSE]
)


datatable(common_genes_data, 
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "Common Significant Genes Across SLEDAI, C3, and C4 Analyses")

4.5 GO and KEGG Pathway Enrichment Analysis

We performed GO and KEGG enrichment analyses on the 1244 common differentially expressed genes shared across the SLEDAI, C3, and C4 stratified comparisons.

GO enrichment revealed strong associations with immune responses, such as “response to virus”, “mucosal immune response”, and chromatin organization processes.

KEGG analysis further confirmed significant enrichment in immune-related pathways, including “Systemic lupus erythematosus”, “NOD-like receptor signaling”, and “Transcriptional misregulation in cancer”, supporting the biological relevance of our selected genes.

5.Interactive chart display

5.1. Interactive volcano map presentation

To avoid excessive rendering complexity in presenting files, only genes with good performance in common differences were displayed. Avoiding the occurrence of lagging situations.

Code
results_sledai$Gene <- rownames(results_sledai)
results_sledai$Time <- "SLEDAI"

results_c3$Gene <- rownames(results_c3)
results_c3$Time <- "C3"

results_c4$Gene <- rownames(results_c4)
results_c4$Time <- "C4"


volcano_all <- rbind(results_sledai, results_c3, results_c4)


volcano_all$IsCommon <- ifelse(volcano_all$Gene %in% common_genes, "Common", "NotCommon")


volcano_common <- subset(volcano_all, IsCommon == "Common")


library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:AnnotationDbi':

    select
The following object is masked from 'package:IRanges':

    slice
The following object is masked from 'package:S4Vectors':

    rename
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
Code
p_common <- ggplot(volcano_common, aes(x = logFC, y = -log10(P.Value))) +
  geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8) +
  facet_wrap(~ Time) +
  labs(x = "logFC", y = "-log10(p value)", title = "Common Significant Genes") +
  theme_minimal()
Warning in geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8):
Ignoring unknown aesthetics: text
Code
p_interactive <- ggplotly(p_common, tooltip = "text")
p_interactive

5.2. MA interaction diagram presentation

Presented in the form of MA charts, the data filtering is consistent with volcano charts.

Code
p_ma <- ggplot(volcano_common, aes(x = AveExpr, y = logFC)) +
  geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8) +
  facet_wrap(~ Time) +
  labs(x = "Average Expression (log2)", y = "log2 Fold Change", title = "MA Plot - Common DEGs") +
  theme_minimal()
Warning in geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8):
Ignoring unknown aesthetics: text
Code
p_ma_interactive <- ggplotly(p_ma, tooltip = "text")
p_ma_interactive